# Resolve relative file paths in every chunk against the FARS 2020 data folder.
# NOTE: this is an absolute, machine-specific path.
knitr::opts_knit$set(
  root.dir = "/Users/charleshanks/desktop/msds/spring_23/ml/FARS2020NationalCSV"
)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
# Accident-level table (read relative to the knitr root directory).
acc <- read_csv("accident.csv")
Rows: 35935 Columns: 81── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (34): STATENAME, COUNTYNAME, CITYNAME, MONTHNAME, DAY_WEEKNAME, HOURNAME, MINUTENAME, NHSNAME, ROUTENAME, TWAY_ID, TWAY_...
dbl (47): STATE, ST_CASE, VE_TOTAL, VE_FORMS, PVH_INVL, PEDS, PERNOTMVIT, PERMVIT, PERSONS, COUNTY, CITY, DAY, DAYNAME, MONT...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Vehicle-level table. readr reports parsing issues for this file; inspect
# them with problems(veh) before relying on the affected columns.
veh <- read_csv("vehicle.csv")
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 54552 Columns: 201── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (108): STATENAME, NUMOCCSNAME, MONTHNAME, HOURNAME, MINUTENAME, HARM_EVNAME, MAN_COLLNAME, UNITTYPENAME, HIT_RUNNAME, RE...
dbl (93): STATE, ST_CASE, VEH_NO, VE_FORMS, NUMOCCS, DAY, DAYNAME, MONTH, HOUR, MINUTE, HARM_EV, MAN_COLL, UNITTYPE, HIT_RU...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Person-level table.
per <- read_csv("person.csv")
Rows: 86396 Columns: 126── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (59): STATENAME, MONTHNAME, HOURNAME, MINUTENAME, RUR_URBNAME, FUNC_SYSNAME, HARM_EVNAME, MAN_COLLNAME, SCH_BUSNAME, MAK...
dbl (67): STATE, ST_CASE, VE_FORMS, VEH_NO, PER_NO, STR_VEH, COUNTY, DAY, DAYNAME, MONTH, HOUR, MINUTE, RUR_URB, FUNC_SYS, H...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drug-test results table.
dru <- read_csv("drugs.csv")
Rows: 107141 Columns: 9── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): STATENAME, DRUGSPECNAME, DRUGRESNAME
dbl (6): STATE, ST_CASE, VEH_NO, PER_NO, DRUGSPEC, DRUGRES
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Interested in the people driving the vehicles that crashed: keep only the
# person rows with PER_TYP == 1.
drivers <- filter(per, PER_TYP == 1)
# Print the column names of the accident table.
names(acc)
[1] "STATE" "STATENAME" "ST_CASE" "VE_TOTAL" "VE_FORMS" "PVH_INVL" "PEDS" "PERNOTMVIT"
[9] "PERMVIT" "PERSONS" "COUNTY" "COUNTYNAME" "CITY" "CITYNAME" "DAY" "DAYNAME"
[17] "MONTH" "MONTHNAME" "YEAR" "DAY_WEEK" "DAY_WEEKNAME" "HOUR" "HOURNAME" "MINUTE"
[25] "MINUTENAME" "NHS" "NHSNAME" "ROUTE" "ROUTENAME" "TWAY_ID" "TWAY_ID2" "RUR_URB"
[33] "RUR_URBNAME" "FUNC_SYS" "FUNC_SYSNAME" "RD_OWNER" "RD_OWNERNAME" "MILEPT" "MILEPTNAME" "LATITUDE"
[41] "LATITUDENAME" "LONGITUD" "LONGITUDNAME" "SP_JUR" "SP_JURNAME" "HARM_EV" "HARM_EVNAME" "MAN_COLL"
[49] "MAN_COLLNAME" "RELJCT1" "RELJCT1NAME" "RELJCT2" "RELJCT2NAME" "TYP_INT" "TYP_INTNAME" "WRK_ZONE"
[57] "WRK_ZONENAME" "REL_ROAD" "REL_ROADNAME" "LGT_COND" "LGT_CONDNAME" "WEATHER" "WEATHERNAME" "SCH_BUS"
[65] "SCH_BUSNAME" "RAIL" "RAILNAME" "NOT_HOUR" "NOT_HOURNAME" "NOT_MIN" "NOT_MINNAME" "ARR_HOUR"
[73] "ARR_HOURNAME" "ARR_MIN" "ARR_MINNAME" "HOSP_HR" "HOSP_HRNAME" "HOSP_MN" "HOSP_MNNAME" "FATALS"
[81] "DRUNK_DR"
# Print the column names of the drivers table for comparison with acc.
names(drivers)
[1] "STATE" "STATENAME" "ST_CASE" "VE_FORMS" "VEH_NO" "PER_NO"
[7] "STR_VEH" "COUNTY" "DAY" "DAYNAME" "MONTH" "MONTHNAME"
[13] "HOUR" "HOURNAME" "MINUTE" "MINUTENAME" "RUR_URB" "RUR_URBNAME"
[19] "FUNC_SYS" "FUNC_SYSNAME" "HARM_EV" "HARM_EVNAME" "MAN_COLL" "MAN_COLLNAME"
[25] "SCH_BUS" "SCH_BUSNAME" "MAKE" "MAKENAME" "MAK_MOD" "MAK_MODNAME"
[31] "BODY_TYP" "BODY_TYPNAME" "MOD_YEAR" "MOD_YEARNAME" "TOW_VEH" "TOW_VEHNAME"
[37] "SPEC_USE" "SPEC_USENAME" "EMER_USE" "EMER_USENAME" "ROLLOVER" "ROLLOVERNAME"
[43] "IMPACT1" "IMPACT1NAME" "FIRE_EXP" "FIRE_EXPNAME" "AGE" "AGENAME"
[49] "SEX" "SEXNAME" "PER_TYP" "PER_TYPNAME" "INJ_SEV" "INJ_SEVNAME"
[55] "SEAT_POS" "SEAT_POSNAME" "REST_USE" "REST_USENAME" "REST_MIS" "REST_MISNAME"
[61] "AIR_BAG" "AIR_BAGNAME" "EJECTION" "EJECTIONNAME" "EJ_PATH" "EJ_PATHNAME"
[67] "EXTRICAT" "EXTRICATNAME" "DRINKING" "DRINKINGNAME" "ALC_DET" "ALC_DETNAME"
[73] "ALC_STATUS" "ALC_STATUSNAME" "ATST_TYP" "ATST_TYPNAME" "ALC_RES" "ALC_RESNAME"
[79] "DRUGS" "DRUGSNAME" "DRUG_DET" "DRUG_DETNAME" "DSTATUS" "DSTATUSNAME"
[85] "HOSPITAL" "HOSPITALNAME" "DOA" "DOANAME" "DEATH_DA" "DEATH_DANAME"
[91] "DEATH_MO" "DEATH_MONAME" "DEATH_YR" "DEATH_YRNAME" "DEATH_HR" "DEATH_HRNAME"
[97] "DEATH_MN" "DEATH_MNNAME" "DEATH_TM" "DEATH_TMNAME" "LAG_HRS" "LAG_HRSNAME"
[103] "LAG_MINS" "LAG_MINSNAME" "WORK_INJ" "WORK_INJNAME" "HISPANIC" "HISPANICNAME"
[109] "LOCATION" "LOCATIONNAME" "HELM_USE" "HELM_USENAME" "HELM_MIS" "HELM_MISNAME"
[115] "VPICMAKE" "VPICMAKENAME" "VPICMODEL" "VPICMODELNAME" "VPICBODYCLASS" "VPICBODYCLASSNAME"
[121] "ICFINALBODY" "ICFINALBODYNAME" "GVWR_FROM" "GVWR_FROMNAME" "GVWR_TO" "GVWR_TONAME"
# Find the columns repeated in both datasets: one logical per drivers column,
# TRUE when a column with the same name also exists in acc.
names(drivers) %in% names(acc)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[21] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE
# Columns that exist only in the driver (person-level) data, i.e. not in
# accident.csv. They are selected by name instead of by position (5:7 and
# 27:126) so the step keeps working if the column order of either file changes.
driver_only_cols <- setdiff(names(drivers), names(acc))
distinct_driver_cols <- drivers[, driver_only_cols]
distinct_driver_cols

# Driver data reduced to the join key (ST_CASE) plus the driver-only features
# that will be added to the accident data.
drivers2 <- drivers[, c("ST_CASE", driver_only_cols)]

# Dataset to refine for the model: every accident row with its matching driver
# rows attached. left_join keeps accidents without a matching driver (driver
# columns become NA) and repeats an accident once per matching driver row.
drivers3 <- acc %>% left_join(drivers2, by = "ST_CASE")
There are a lot of factors here, so encoding them is going to produce a lot of dummy columns.
Principal Component Analysis:
# Running total of `props` across the principal components. `props` is defined
# elsewhere; presumably it holds each component's share of total variance.
# NOTE(review): in the printed result PC1 contributes ~0.00015 while PC2 adds
# ~0.143. That is unexpected if components are ordered by decreasing variance;
# verify how `props` is computed.
cumsum(props)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
0.0001500427 0.1431667880 0.2232873766 0.2782261465 0.3213756296 0.3612389162 0.3987044462 0.4353003572 0.4707669069
PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
0.5059050363 0.5401248930 0.5687142488 0.5956727465 0.6185837349 0.6374895021 0.6556157802 0.6728084051 0.6885255547
PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27
0.7033484114 0.7171387594 0.7296819948 0.7416325190 0.7523932332 0.7625296562 0.7725167509 0.7819136938 0.7909017697
PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
0.7995598583 0.8073577193 0.8146069139 0.8214276282 0.8280596715 0.8342958999 0.8401674341 0.8459823229 0.8513135728
PC37 PC38 PC39 PC40 PC41 PC42 PC43 PC44 PC45
0.8564121033 0.8614536043 0.8662394932 0.8707793927 0.8752722774 0.8796651555 0.8839637895 0.8880922183 0.8921378332
PC46 PC47 PC48 PC49 PC50 PC51 PC52 PC53 PC54
0.8960667982 0.8998219356 0.9034600320 0.9068151090 0.9100575460 0.9131014111 0.9159906790 0.9186425835 0.9210339267
PC55 PC56 PC57 PC58 PC59 PC60 PC61 PC62 PC63
0.9233810540 0.9256303563 0.9277665607 0.9297469255 0.9316676015 0.9333695803 0.9349878010 0.9365050013 0.9379877817
PC64 PC65 PC66 PC67 PC68 PC69 PC70 PC71 PC72
0.9394226272 0.9408318114 0.9421961259 0.9435448974 0.9448630486 0.9461644490 0.9474356440 0.9487017511 0.9499076379
PC73 PC74 PC75 PC76 PC77 PC78 PC79 PC80 PC81
0.9511074070 0.9522468886 0.9533642651 0.9544551541 0.9555358975 0.9565847566 0.9575771518 0.9585519866 0.9595011775
PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89 PC90
0.9604186592 0.9613180401 0.9621582063 0.9629829745 0.9637987630 0.9645763353 0.9653450526 0.9660911825 0.9668227349
PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99
0.9675427651 0.9682509876 0.9689447837 0.9696340044 0.9702806130 0.9709223297 0.9715505273 0.9721590672 0.9727365164
PC100 PC101 PC102 PC103 PC104 PC105 PC106 PC107 PC108
0.9732859088 0.9738042771 0.9743181852 0.9748297675 0.9753377452 0.9758176052 0.9762914836 0.9767561988 0.9772070074
PC109 PC110 PC111 PC112 PC113 PC114 PC115 PC116 PC117
0.9776478759 0.9780788145 0.9785010877 0.9789203205 0.9793280388 0.9797322449 0.9801275550 0.9805025839 0.9808717775
PC118 PC119 PC120 PC121 PC122 PC123 PC124 PC125 PC126
0.9812395585 0.9816048504 0.9819650169 0.9823243163 0.9826806856 0.9830255390 0.9833684233 0.9837051304 0.9840398084
PC127 PC128 PC129 PC130 PC131 PC132 PC133 PC134 PC135
0.9843742729 0.9847038583 0.9850306352 0.9853557478 0.9856780357 0.9859959881 0.9863092547 0.9866201474 0.9869267589
PC136 PC137 PC138 PC139 PC140 PC141 PC142 PC143 PC144
0.9872322371 0.9875329712 0.9878297121 0.9881252463 0.9884123730 0.9886985552 0.9889819360 0.9892526220 0.9895191125
PC145 PC146 PC147 PC148 PC149 PC150 PC151 PC152 PC153
0.9897836722 0.9900433447 0.9902941660 0.9905417760 0.9907866811 0.9910279374 0.9912630250 0.9914949060 0.9917206197
PC154 PC155 PC156 PC157 PC158 PC159 PC160 PC161 PC162
0.9919419817 0.9921621098 0.9923731905 0.9925801358 0.9927858935 0.9929894487 0.9931788959 0.9933603388 0.9935392166
PC163 PC164 PC165 PC166 PC167 PC168 PC169 PC170 PC171
0.9937146824 0.9938897792 0.9940642568 0.9942382799 0.9944110889 0.9945814224 0.9947445581 0.9949035570 0.9950608596
PC172 PC173 PC174 PC175 PC176 PC177 PC178 PC179 PC180
0.9952148245 0.9953656439 0.9955091489 0.9956466655 0.9957800636 0.9959033026 0.9960243425 0.9961442337 0.9962631671
PC181 PC182 PC183 PC184 PC185 PC186 PC187 PC188 PC189
0.9963807713 0.9964882243 0.9965853591 0.9966794682 0.9967676707 0.9968539499 0.9969399275 0.9970228638 0.9971051074
PC190 PC191 PC192 PC193 PC194 PC195 PC196 PC197 PC198
0.9971870756 0.9972678132 0.9973460632 0.9974221417 0.9974932395 0.9975642890 0.9976338295 0.9977021278 0.9977685575
PC199 PC200 PC201 PC202 PC203 PC204 PC205 PC206 PC207
0.9978336567 0.9978979042 0.9979604748 0.9980223830 0.9980842269 0.9981449831 0.9982052235 0.9982651987 0.9983231641
PC208 PC209 PC210 PC211 PC212 PC213 PC214 PC215 PC216
0.9983777852 0.9984307166 0.9984816129 0.9985302683 0.9985759611 0.9986194884 0.9986613320 0.9987023719 0.9987423820
PC217 PC218 PC219 PC220 PC221 PC222 PC223 PC224 PC225
0.9987811742 0.9988186361 0.9988555328 0.9988921528 0.9989280890 0.9989636503 0.9989973330 0.9990306637 0.9990628448
PC226 PC227 PC228 PC229 PC230 PC231 PC232 PC233 PC234
0.9990946659 0.9991253015 0.9991554802 0.9991852897 0.9992135746 0.9992415741 0.9992689394 0.9992955855 0.9993216094
PC235 PC236 PC237 PC238 PC239 PC240 PC241 PC242 PC243
0.9993453960 0.9993688998 0.9993911552 0.9994133871 0.9994342200 0.9994550197 0.9994753229 0.9994955029 0.9995155020
PC244 PC245 PC246 PC247 PC248 PC249 PC250 PC251 PC252
0.9995352514 0.9995547711 0.9995740691 0.9995926899 0.9996109675 0.9996282056 0.9996435456 0.9996585681 0.9996733984
PC253 PC254 PC255 PC256 PC257 PC258 PC259 PC260 PC261
0.9996881248 0.9997027604 0.9997159838 0.9997284219 0.9997408147 0.9997530965 0.9997647341 0.9997762616 0.9997876141
PC262 PC263 PC264 PC265 PC266 PC267 PC268 PC269 PC270
0.9997987090 0.9998092176 0.9998197069 0.9998300993 0.9998399328 0.9998492621 0.9998585663 0.9998670101 0.9998752668
PC271 PC272 PC273 PC274 PC275 PC276 PC277 PC278 PC279
0.9998833985 0.9998913267 0.9998990832 0.9999066325 0.9999130414 0.9999192441 0.9999254149 0.9999314143 0.9999373022
PC280 PC281 PC282 PC283 PC284 PC285 PC286 PC287 PC288
0.9999428380 0.9999480424 0.9999532272 0.9999580470 0.9999626384 0.9999665781 0.9999703903 0.9999741888 0.9999777488
PC289 PC290 PC291 PC292 PC293 PC294 PC295 PC296 PC297
0.9999810128 0.9999838447 0.9999865259 0.9999891437 0.9999904713 0.9999917671 0.9999930556 0.9999942979 0.9999950556
PC298 PC299 PC300 PC301 PC302 PC303 PC304 PC305 PC306
0.9999958017 0.9999964662 0.9999971216 0.9999977673 0.9999984086 0.9999990456 0.9999996774 1.0000000000 1.0000000000
PC307 PC308 PC309 PC310 PC311 PC312 PC313 PC314 PC315
1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000
PC316 PC317 PC318 PC319 PC320 PC321 PC322 PC323 PC324
1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000
PC325 PC326 PC327 PC328 PC329 PC330 PC331 PC332 PC333
1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000
PC334 PC335 PC336 PC337 PC338 PC339 PC340 PC341 PC342
1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000
PC343 PC344 PC345 PC346 PC347 PC348 PC349 PC350 PC351
1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000 1.0000000000
PC352 PC353 PC354
1.0000000000 1.0000000000 1.0000000000